# Notebook display helper: injects a jQuery snippet that adds a
# "Show Code"/"Hide Code" button and hides all input (code) cells by
# default in the rendered notebook / HTML export.
from IPython.display import HTML
HTML('''<script>
function code_toggle() {
if (code_shown){
$('div.input').hide('500');
$('#toggleButton').val('Show Code')
} else {
$('div.input').show('500');
$('#toggleButton').val('Hide Code')
}
code_shown = !code_shown
}
$( document ).ready(function(){
code_shown=false;
$('div.input').hide()
});
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show Code"></form>''')

StackOverflow is a platform owned by the StackExchange Network where students, professionals, and enthusiasts exchange information about programming through posts, queries, and comments. It also serves as a platform for programmers and businesses to collaborate and work on projects.
With roughly 18 million posted questions, it is a comprehensive network of programmers across the field. Given this wide network, StackOverflow has become the go-to website for programmers seeking to clarify and better understand programming algorithms and their syntax.

The general framework of the study is as follows:

Data gathering
Two-step clustering
#preliminaries
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from wordcloud import WordCloud
import requests
import time
from bs4 import BeautifulSoup
import sqlite3
import seaborn as sns
import re
import os
from functools import reduce
import psycopg2
from xml.etree import ElementTree
import random
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from IPython.display import clear_output
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from scipy.spatial.distance import euclidean, cityblock
from sklearn.metrics import (adjusted_mutual_info_score, adjusted_rand_score,
calinski_harabaz_score, silhouette_score)
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings("ignore")
from sklearn.feature_extraction.text import TfidfVectorizer
# import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
import sqlite3
from matplotlib.colors import LinearSegmentedColormap
%matplotlib inline

def FiletoDBLoad(filename, tablename, conn):
    """Parse a StackOverflow XML dump file and append its rows to a SQL table.

    For the 'Posts' table, the raw dump is first filtered down to question
    posts (PostTypeId="1") into a temporary file '.temp.txt', since answers
    are not needed for this analysis.

    Parameters
    ----------
    filename : str
        Path to the XML dump (one <row .../> element per line; the first
        row element is expected on the third line of the file).
    tablename : str
        Destination SQL table name ('Posts' triggers question filtering).
    conn : sqlite3.Connection
        Open database connection; rows are appended if the table exists.
    """
    if tablename == 'Posts':
        # Filter the dump to question rows in pure Python instead of the
        # original IPython `!echo`/`!cat | grep` shell magics (whose `!cat`
        # target was broken and which are not portable outside IPython).
        with open('.temp.txt', 'w', encoding='utf-8') as out:
            out.write('<?xml version="1.0" encoding="utf-8"?>\n')
            out.write('<posts>\n')
            with open(filename, 'r', encoding='utf-8') as src:
                for line in src:
                    if 'PostTypeId="1"' in line:
                        out.write(line)
            out.write('</posts>\n')
        filename = '.temp.txt'
    # The third line of the file holds the first <row .../> element; its
    # attribute names define the table columns.
    with open(filename, 'r', encoding='utf-8') as f:
        header = f.read().split('\n')[2]
    header = re.findall(r'(\S*?)=', header)
    tree = ElementTree.parse(filename)
    xml_data = tree.getroot()
    table = []
    for elem in xml_data:
        row = {}
        for j in header:
            try:
                # Prefix each column with its position so the DataFrame
                # keeps the dump's column order; stripped again below.
                col = str(f'{header.index(j):02d}') + '_' + j
                row[col] = elem.attrib[j]
            except KeyError:
                # Attribute absent on this row — leave the cell empty.
                continue
        table.append(row)
    df_table = pd.DataFrame(table)
    df_table.columns = [i[3:] for i in df_table.columns]
    df_table.to_sql(tablename, con=conn, if_exists='append', index=False)
# Load every downloaded dump file into the SQLite database as question
# posts. NOTE(review): `files` is not defined in this chunk — presumably
# a list of dump XML paths built in an earlier cell; confirm before running.
conn = sqlite3.connect('/mnt/processed/private/msds2020/lt13/database/stackoverflow.db')
cur = conn.cursor()
for f in files:
    FiletoDBLoad(filename=f,tablename='Posts',conn=conn)

# Proxy servers for web scraping
os.environ['HTTP_PROXY'] = 'http://165.22.32.8:80'
os.environ['HTTPS_PROXY'] = 'https://66.70.167.122:3128'
# Raw request headers copied from a browser session so requests look like
# normal browser traffic. NOTE(review): this embeds a session cookie —
# consider stripping it before sharing the notebook.
req_header = '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
accept-encoding: gzip, deflate, br
accept-language: en-US,en;q=0.9
cache-control: max-age=0
cookie: _ga=GA1.2.1223690127.1554361835; __qca=P0-1087024619-1554361834541; __gads=ID=95575f8f21b13b6a:T=1554361834:S=ALNI_MYzrAP4MO9xO3-RuodMxFKsUvXijA; notice-ctt=4%3B1554364020909; prov=5a1477ef-3607-2d87-c83b-4dadc0666ec0; _gid=GA1.2.229546230.1560849365; acct=t=sHvGyo1FYK9EJrJ2RTE92i%2bijUmMLbgd&s=ODXb%2b%2f3uMcMKfe3z1cbrgAlLUsX%2bAdVM; _gat=1
referer: https://stackoverflow.com/
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'''
# Parse the "name: value" header lines into a dict for requests.
headers = {i.lower(): j for i,j in re.findall(r'(.*?): (.*?)$', req_header, re.M)} #re.M --> multiline
# Fetch this month's top-questions page and collect the question links.
url = 'https://stackoverflow.com/?tab=month'
resp = requests.get(url, headers = headers)
resp_soup = BeautifulSoup(resp.text, 'lxml')
links = [i.get('href')
         for i in resp_soup.select('div.summary a.question-hyperlink')]
N_link = len(links)    # total number of question pages to scrape
data = []              # scraped rows, one tuple per question
failed_links = []      # links that could not be scraped
count = 0              # pages scraped so far
t = 2                  # polite delay (seconds) between page requests
# Scrape each collected question link and build rows mirroring the
# data-dump Posts table schema.
while links:
    link = links[0]
    url_2 = 'https://stackoverflow.com/' + link
    # Retry until the page is served with HTTP 200.
    while True:
        try:
            resp_2 = requests.get(url_2, headers=headers)
            if resp_2.status_code == 200:
                break
            else:
                print("Red status code")
                time.sleep(5)
        except Exception as e:
            print("Pause for 5 seconds before requesting again")
            time.sleep(5)
    resp_soup2 = BeautifulSoup(resp_2.text, 'lxml')
    # Question title and tags (tags rebuilt in the dump's '<a><b>' format).
    title = resp_soup2.select_one('div#question-header > h1 > a').text
    tags = [i.text for i in resp_soup2.select(
        'div.post-taglist > div > a')]
    tags = ''.join([f'<{i}>' for i in tags])
    # post id
    row_ID = int(re.findall(r'/questions/(\d+)/', link)[0])
    post_type_ID = 1  # questions only
    accepted_ans_ID = resp_soup2.findAll(
        'div', {'itemprop': 'acceptedAnswer'})
    if accepted_ans_ID:
        accepted_ans_ID = accepted_ans_ID[0]['data-answerid']
    else:
        accepted_ans_ID = ''
    # post statistics
    question_stats = resp_soup2.select('div.module.question-stats b')
    creation_date = question_stats[0].time['datetime']
    view_count = int(re.findall(
        r'[0-9,]+', question_stats[1].text)[0].replace(',', ''))
    last_act_date = ''
    # A third stat block is present only when the post has later activity.
    if len(question_stats) == 3:
        last_act_date = question_stats[2].a['title'].replace(
            ' ', 'T').replace('Z', '')
    # score and body
    score = int(resp_soup2.select_one(
        'div.js-vote-count.grid--cell.fc-black-500.fs-title.grid.fd-column.ai-center').text)
    body = ''.join(list(map(str, resp_soup2.select('div.post-text p'))))
    # user id and editor id
    user_dets_main = resp_soup2.select_one(
        'div.mt16.pt4.grid.gs8.gsy.fw-wrap.jc-end.ai-start')
    user_dets = user_dets_main.select('div.user-details')
    owner_ID = re.findall(r'/users/(\d+)/', user_dets[-1].a['href'])[0]
    if len(user_dets) == 2:
        # Two user-details blocks => the post was edited.
        edit_date = resp_soup2.select(
            'div.question div.user-action-time')[0].span['title'].replace(' ', 'T').replace('Z', '')
        if user_dets[0].has_attr('itemprop'):
            # Owner edited their own post.
            editor_ID = owner_ID
        else:
            try:
                editor_ID = re.findall(
                    r'/users/(\d+)/', user_dets[0].a['href'])[0]
            except TypeError:
                # Anonymous/deleted editor has no profile link.
                editor_ID = ''
    else:
        edit_date = ''
        editor_ID = ''
    # answer and comment count
    ans_count = int(resp_soup2.select_one(
        'div.subheader.answers-subheader').h2['data-answercount'])
    comm = resp_soup2.select('div.question div.comments ul')
    # Skip a leading close-status list so comm[0] is the comment list.
    if comm[0]['class'][0] == 'close-as-off-topic-status-list':
        comm.pop(0)
    comm_hidden = int(comm[0][
        'data-remaining-comments-count'])
    comm_shown = len(resp_soup2.select('div.question span.comment-copy'))
    comm_count = comm_hidden + comm_shown
    data.append((row_ID, post_type_ID, accepted_ans_ID, creation_date, score,
                 view_count, body, owner_ID, editor_ID, edit_date,
                 last_act_date, title, tags, ans_count, comm_count))
    time.sleep(t)
    clear_output()
    count += 1
    # BUG FIX: the original printed a constant (100 * N_link / 120); report
    # actual progress as pages scraped over total pages instead.
    print(f'{100 * count / N_link:.2f}% done')
    links.remove(link)
# Assemble the scraped rows into a DataFrame mirroring the data-dump Posts
# schema (string dtype throughout, matching the XML attribute values).
# NOTE(review): the first column is named 'row Id' here but 'Id' elsewhere
# in the notebook — confirm which is intended.
df = pd.DataFrame(data, columns=['row Id', 'PostTypeId', 'AcceptedAnswerId',
                                 'CreationDate', 'Score', 'ViewCount', 'Body',
                                 'OwnerUserId', 'LastEditorUserId', 'LastEditDate',
                                 'LastActivityDate', 'Title', 'Tags', 'AnswerCount',
                                 'CommentCount']).astype(str)
df.head()

# Sample ~10k accepted-answer Python questions per trial, TF-IDF vectorize
# the titles, and PCA the bag-of-words; repeated for 20 trials so the
# robustness of random sampling can be checked afterwards.
max_id = 56412356  # highest post Id in the dump at collection time
# 'python' is redundant (posts are pre-filtered to Python); 'using'/'use'
# are frequent but uninformative.
stop_words = stopwords.words('english') + ['python', 'using', 'use']
for trial_no in range(20):
    print(trial_no)
    # Sample the Id space in blocks of 1000 ids until ~10k posts gathered.
    idxs = list(range(max_id//1000))
    posts = []
    while len(posts) < 1e4:
        idx = random.choice(idxs)
        # NOTE(review): the trailing `Id LIKE '{idx}___'` clause overlaps
        # the BETWEEN filter and looks redundant — confirm intended.
        cur.execute(f"""SELECT DISTINCT * FROM PostsQuestions
WHERE tags LIKE "%python%"
AND AcceptedAnswerId IS NOT NULL
AND Id BETWEEN {idx*1000} AND {idx*1000+999}
AND Id LIKE '{idx}___'""")
        posts.extend(cur.fetchall())
        idxs.remove(idx)  # sample each block at most once
    tags = list(zip(*posts))
    ids, titles = tags[0], tags[11]  # column 0 = Id, column 11 = Title
    tfidf_vectorizer = TfidfVectorizer(token_pattern='\w{2,}',
                                       stop_words=stop_words,
                                       min_df=0.001,
                                       binary=True)
    bow = tfidf_vectorizer.fit_transform(titles)
    # PCA via eigendecomposition of the dense bag-of-words covariance.
    V = np.cov(bow.toarray(), rowvar=False)
    lambdas, w = np.linalg.eig(V)
    indices = np.argsort(lambdas)[::-1]
    lambdas = lambdas[indices]
    w = w[:, indices]
    var_explained = lambdas / lambdas.sum()
    # Elbow: first component whose variance drops below 10% of the top one.
    for m in range(len(var_explained)):
        if var_explained[m] < 0.1*var_explained[0]:
            break
    m += 1
    cum_var_explained = var_explained.cumsum()
    # Components needed to reach 80% cumulative variance explained.
    n_PC = np.searchsorted(cum_var_explained, 0.8)
    X = TruncatedSVD(n_components=n_PC).fit_transform(bow)
    X_m = TruncatedSVD(n_components=m).fit_transform(bow)
    feats = tfidf_vectorizer.get_feature_names()
    # Persist everything needed for the robustness checks below.
    files_ = {'bow':bow, 'feats':feats, 'posts':posts,
              'trunc_elbow':(m, X_m), 'trunc_0.8':(n_PC, X)}
    pickle.dump(files_, open(f'/mnt/processed/private/msds2020/lt13/pickle/robustness_trials/T{trial_no}', 'wb'))
However, due to the random nature of sampling, the robustness of the sampling must be checked. In particular, the features and their weights were compared across trials to test whether the results are similar. This was done for 20 trials, and two validations were performed.
# Validation 1: distribution of the summed TF-IDF weights of the overall
# top-10 features across the 20 sampling trials (boxplot).
T = []
for t in range(20):
    T.append(pickle.load(open(f'/mnt/processed/private/msds2020/lt13/pickle/robustness_trials/T{t}', 'rb')))
# Pool per-feature summed weights across all trials.
wgts_base = []
feats_base = []
for t in T:
    wgts_base.extend(list(t['bow'].toarray().sum(axis=0)))
    feats_base.extend(t['feats'])
# Rank pooled features by weight; dict() keeps only the first (highest)
# occurrence of each feature name, giving the overall top-10.
top_idx = np.argsort(np.array(wgts_base))[::-1]
top_fts = list(zip(np.array(feats_base)[top_idx], np.array(wgts_base)[top_idx]))
top_fts = list(dict(top_fts).keys())[:10]
print(top_fts)
fig, ax = plt.subplots(1, 2, figsize=(16,6))
df_feat = pd.DataFrame()
# Per top feature, collect its summed weight in each trial.
for i in top_fts:
    fts_wgt = []
    for t in T:
        try:
            fts_idx = t['feats'].index(i)
            fts_wgt.append(t['bow'].toarray()[:,fts_idx].sum())
        except:
            # Feature absent from this trial's vocabulary -> weight 0.
            fts_wgt.append(0)
    df_feat[i] = fts_wgt
sns.boxplot(data=df_feat, ax=ax[0])
ax[0].set_ylabel("Sum of TF-IDF weights")
ax[0].set_xlabel("Top features")
def jaccard(A, B, feats):
    """Generalized (weighted) Jaccard similarity between two weight dicts.

    For each feature f in `feats` with A[f] > 0, accumulates
    min(A[f], B[f]) / max(A[f], B[f]) (features missing from B count as
    weight 0), then averages over len(feats). Zero-weight features in A
    contribute 0 to the sum but still count in the denominator.

    Parameters
    ----------
    A, B : dict
        Mapping of feature name -> non-negative weight. Every feature in
        `feats` must be a key of A.
    feats : sequence
        Feature names to compare (typically A's full feature list).

    Returns
    -------
    float
        Average per-feature min/max ratio, in [0, 1].
    """
    jacc_i = 0
    for i in feats:
        if A[i] > 0:
            jacc_i += min(A[i], B.get(i, 0)) / max(A[i], B.get(i, 0))
        # BUG FIX: the original printed the feature name here (leftover
        # debug output); a zero weight simply contributes nothing.
    jacc_i /= len(feats)
    return jacc_i
# Validation 2: pairwise generalized Jaccard similarity of the summed
# TF-IDF weights between all 20 trials. Only the upper triangle (n >= i)
# is filled; the lower triangle stays 0.
jac_sim = np.zeros((20, 20))
for i in range(20):
    data_0 = pickle.load(open(f'/mnt/processed/private/msds2020/lt13/pickle/robustness_trials/T{i}', 'rb'))
    bow0 = data_0['bow']
    wgt_0 = bow0.sum(axis=0)   # 1 x n_features matrix of summed weights
    feats_0 = data_0['feats']
    A = dict(zip(feats_0, np.array(wgt_0).reshape(wgt_0.shape[1])))
    for n in range(i, 20):
        data_n = pickle.load(open(f'/mnt/processed/private/msds2020/lt13/pickle/robustness_trials/T{n}', 'rb'))
        bown = data_n['bow']
        wgt_n = bown.sum(axis=0)
        feats_n = data_n['feats']
        B = dict(zip(feats_n, np.array(wgt_n).reshape(wgt_n.shape[1])))
        jac_sim[i, n] = jaccard(A, B, feats_0)
# Heatmap drawn on the second panel of the figure created in Validation 1.
im = ax[1].imshow(jac_sim, cmap='Blues')
# ax[1].axis('off')
ax[1].set_xlabel("Sample")
ax[1].set_ylabel("Sample")
ax[1].set_xticks([i for i in range(0, 21, 2)])
ax[1].set_yticks([i for i in range(0, 21, 2)])
ax[1].set_xlim(0,19)
ax[1].set_ylim(0,19)
# Tick labels use 1-based trial numbers.
ax[1].set_xticklabels([i+1 for i in range(0, 20, 2)])
ax[1].set_yticklabels([i+1 for i in range(0, 20, 2)])
plt.colorbar(im, ax=ax[1])
fig.savefig('robustness.png', dpi=300)
Both observations indicate high similarity of the sampled corpora.

The dataset has 5 numerical features:
# Use trial 3's sample for the main analysis; rebuild the posts DataFrame
# with the dump's Posts schema.
X_0 = pickle.load(open(f'/mnt/processed/private/msds2020/lt13/pickle/robustness_trials/T3', 'rb'))
df_X0 = pd.DataFrame(X_0['posts'], columns = ['Id', 'PostTypeId', 'AcceptedAnswerId',
                     'CreationDate', 'Score', 'ViewCount', 'Body', 'OwnerUserId',
                     'LastEditorUserId', 'LastEditDate', 'LastActivityDate',
                     'Title', 'Tags', 'AnswerCount', 'CommentCount',
                     'FavoriteCount', 'CommunityOwnedDate'])
# The five numerical engagement features; missing counts treated as 0.
X0_num = df_X0[['Score','ViewCount','AnswerCount','CommentCount','FavoriteCount']].fillna(0).astype(int)
sns.pairplot(X0_num)
# EDA: check the heavy-tailed features under a log transform (Score is
# shifted so its minimum maps to log(1) = 0, since scores can be negative).
sns.distplot(X0_num['Score'].apply(lambda x: np.log(x+1-X0_num['Score'].min())))
sns.distplot(X0_num['ViewCount'].apply(lambda x: np.log(x+1)))
Looking at the scores and view counts, most of the data are in the tens or hundreds. A few, however, are orders of magnitude higher.
Based on the results of the EDA, log-scaling is appropriate for the given data.
# Apply the log scaling chosen in the EDA, then min-max scale all five
# features to [0, 1] so no single feature dominates the clustering distance.
X0_num['Score'] = X0_num['Score'].apply(lambda x: np.log(x+1-X0_num['Score'].min()))
X0_num['ViewCount'] = X0_num['ViewCount'].apply(lambda x: np.log(x+1))
Xnum_data = MinMaxScaler().fit_transform(X0_num.to_numpy())
Xnum_data
Now that we have our design matrix, dimensionality reduction should be used to observe the effect of the number of principal components necessary.
# PCA (via eigendecomposition of the covariance matrix) on the scaled
# numerical features, to decide how many components to keep.
V = np.cov(Xnum_data, rowvar=False)
lambdas, w = np.linalg.eig(V)
# Sort eigenpairs by decreasing eigenvalue.
indices = np.argsort(lambdas)[::-1]
lambdas = lambdas[indices]
w = w[:, indices]
var_explained = lambdas / lambdas.sum()
fig, ax = plt.subplots(1, 2, figsize=(16,5))
ax[0].plot(np.arange(1, len(var_explained)+1), var_explained, lw=3)
ax[0].set_xlabel('Number of features', size=14)
ax[0].set_ylabel('Variance explained', size=14);
cum_var_explained = var_explained.cumsum()
# fig, ax = plt.subplots(figsize=(12,6))
# NOTE(review): searchsorted returns a 0-based index; the component count
# covering 80% of the variance is arguably n_PC + 1 — confirm intended.
n_PC = np.searchsorted(cum_var_explained, 0.8)
ax[1].plot(np.arange(1, len(cum_var_explained)+1),
           cum_var_explained, '-', zorder=1, lw=3)
ax[1].set_ylim(ymin=0)
#ax[1].plot([-50, n_PC], [0.8, 0.8], 'r--')
# ax[1].plot([n_PC, n_PC], [0, 0.8], 'r--')
# ax[1].scatter(n_PC, 0.8, c='r', zorder=2)
#ax[1].set_xlim(-50,)
ax[1].set_xlabel('Features', size=14)
ax[1].set_ylabel('Cumulative variance explained', size=14);
# Elbow: first component whose marginal drop in variance explained is
# below 0.1%.
rate = (var_explained[:-1] - var_explained[1:])/var_explained[:-1]
for m in range(len(rate)):
    if rate[m] < 0.001:
        break
m += 1
# ax[0].axvline(m, c='r');
# print(m)
Using the standard of having $80\%$ of the variance captured, the design matrix will be truncated to 2 components.
# Truncate the numerical design matrix to 2 components, per the
# 80%-variance criterion from the scree analysis above.
Xnum_trunc = TruncatedSVD(n_components=2).fit_transform(Xnum_data)
For this analysis, K-Means is used to cluster the stackoverflow questions based on their numerical features and further verify it through internal validations
def intra_to_inter(X, y, dist, r):
    """Ratio of mean intra-cluster to mean inter-cluster pair distance.

    Draws `r` random index pairs from X (with replacement). Pairs whose
    two points share a cluster label in `y` feed the intra average; pairs
    from different clusters feed the inter average. Pairs with identical
    indices are discarded. Lower is better (tighter, well-separated
    clusters). Uses the global NumPy RNG.
    """
    pairs = np.random.choice(X.shape[0], size=(r, 2))
    intra, inter = [], []
    for a, b in pairs:
        if a == b:
            continue
        bucket = intra if y[a] == y[b] else inter
        bucket.append(dist(X[a], X[b]))
    return (np.average(intra) / np.average(inter))
def cluster_range(X, clusterer, k_start, k_stop):
    """Run KMeans for every k in [k_start, k_stop] and collect metrics.

    `clusterer` is used only for its random_state, so each k gets a fresh
    KMeans with the same seed. Returns a dict of per-k lists:
    'ys' (labels), 'inertias', 'chs' (Calinski-Harabasz), 'iidrs'
    (intra-to-inter ratio over 50 sampled pairs), 'scs' (silhouette).
    """
    results = {'ys': [], 'inertias': [], 'chs': [], 'iidrs': [], 'scs': []}
    for k in range(k_start, k_stop+1):
        print('Executing k=', k)
        model = KMeans(n_clusters=k, random_state=clusterer.random_state)
        labels = model.fit_predict(X)
        results['ys'].append(labels)
        results['inertias'].append(model.inertia_)
        results['iidrs'].append(intra_to_inter(X, labels, euclidean, 50))
        results['chs'].append(calinski_harabaz_score(X, labels))
        results['scs'].append(silhouette_score(X, labels))
    return results
# Run KMeans over k = 2..10 on the scaled numerical features and plot the
# four internal validation criteria plus a cluster-balance chart.
k_start, k_stop = (2, 10)
kmeans_cluster = cluster_range(Xnum_data, KMeans(random_state=1337),
                               k_start=k_start, k_stop=k_stop)
fig, ax = plt.subplots(1,5, figsize=(16,3))
ax[0].plot(range(k_start, k_stop+1), kmeans_cluster['inertias'], 'o--')
ax[0].set_xlabel('k')
ax[0].set_title('inertias');
ax[0].grid(which='both')
ax[1].plot(range(k_start, k_stop+1), kmeans_cluster['iidrs'], 'o--')
ax[1].set_xlabel('k')
ax[1].set_title('intra-to-inter');
ax[1].grid(which='both')
ax[2].plot(range(k_start, k_stop+1), kmeans_cluster['chs'], 'o--')
ax[2].set_xlabel('k')
ax[2].set_title('calinski-harabaz score');
ax[2].grid(which='both')
ax[3].plot(range(k_start, k_stop+1), kmeans_cluster['scs'], 'o--')
ax[3].set_xlabel('k')
ax[3].set_title('silhouette score');
ax[3].grid(which='both')
# Chosen number of clusters, marked on each criterion plot.
k=5
ax[0].axvline(k, c='r', ls='--');
ax[1].axvline(k, c='r', ls='--');
ax[2].axvline(k, c='r', ls='--');
ax[3].axvline(k, c='r', ls='--');
# Fifth panel: stacked bars of cluster sizes per k as a balance measure.
sizes = []
for i in range(k_stop-k_start+1):
    sizes.append(list(np.unique(kmeans_cluster['ys'][i], return_counts=True)[1]))
for i in range(len(sizes)):
    j_prev = 0
    # Grey shades, one per cluster in this k's solution.
    colors = list(zip(*[np.linspace(0,0.8,len(sizes[i])+1) for _ in range(3)]))
    n = 0
    for j in sizes[i]:
        ax[4].bar(i+1, j, width=0.6, bottom=j_prev, color=colors[n])
        j_prev += j
        n+=1
ax[4].set_title('balance measure')
fig.savefig('internal_valid.png', dpi=300)
From the internal validations, taking into account the tradeoffs since we want low inertias, low intra-to-inter, high calinski-harabasz, and high silhouette, $k=5$ is chosen.
# Load the saved numerical-cluster projection and the per-cluster t-SNE
# embeddings of the text features, then lay out the combined summary figure.
path_ = '/mnt/processed/private/msds2020/lt13/pickle/james/'
Xnum_trunc = pickle.load(open(path_+'Xnum_trunc', 'rb'))
w = pickle.load(open(path_+'Xnum_w', 'rb'))  # feature loadings (arrows)
y = pickle.load(open(path_+'Xnum_y', 'rb'))  # numerical cluster labels
X_tsne0 = pickle.load(open(path_+'X3_bow_ts0', 'rb'))
y_0 = pickle.load(open(path_+'y_kmeans0', 'rb'))
X_tsne1 = pickle.load(open(path_+'X3_bow_ts1', 'rb'))
y_1 = pickle.load(open(path_+'y_kmeans1', 'rb'))
X_tsne2 = pickle.load(open(path_+'X3_bow_ts2', 'rb'))
y_2 = pickle.load(open(path_+'y_kmeans2', 'rb'))
X_tsne3 = pickle.load(open(path_+'X3_bow_ts3', 'rb'))
y_3 = pickle.load(open(path_+'y_kmeans3', 'rb'))
X_tsne4 = pickle.load(open(path_+'X3_bow_ts4', 'rb'))
y_4 = pickle.load(open(path_+'y_kmeans4', 'rb'))
# BUG FIX: keep a handle to THIS figure — the original called plt.figure()
# without assignment, so fig.savefig() at the end saved a stale figure
# left over from an earlier cell.
fig = plt.figure(figsize=(9,15), dpi=200)
plt.suptitle('Posts Measures and Themes Clustering')
gs = gridspec.GridSpec(5, 3)
# Main biplot: truncated numerical features colored by cluster, with
# feature-loading arrows overlaid.
ax1 = plt.subplot(gs[:-2,:])
ax1.scatter(Xnum_trunc[:,0], -Xnum_trunc[:,1], alpha=0.8, c=y);
ax1.set_axis_off();
for feat, feat_name in zip(w, ['Score(log)','ViewCount(log)','AnswerCount','CommentCount','FavoriteCount']):
    ax1.arrow(0.2, -0.4, .5*feat[0], .5*feat[1], color='k', width=0.003, ec='none')
    # Log-scaled features get a slightly different label offset.
    if not feat_name.endswith('(log)'):
        ax1.text(0.22+.5*feat[0], -0.38+.5*feat[1], feat_name, ha='center', color='k')
    else:
        ax1.text(0.3+.5*feat[0], -0.4+.5*feat[1], feat_name, ha='center', color='k')
# One t-SNE panel per numerical cluster, annotated with its share of posts.
ax2 = plt.subplot(gs[3,0])
ax2.scatter(X_tsne0[:,0], X_tsne0[:,1], alpha=0.8, c=y_0, cmap='Purples');
ax2.annotate(f'{len(y_0)*100/len(y):.2f}%', xy=(0,0.9), xycoords='axes fraction')
ax2.set_title('Mod Views, High Eng')
ax2.set_axis_off();
ax3 = plt.subplot(gs[3,1])
ax3.scatter(X_tsne1[:,0], X_tsne1[:,1], alpha=0.8, c=y_1, cmap='Blues');
ax3.annotate(f'{len(y_1)*100/len(y):.2f}%', xy=(0,0.9), xycoords='axes fraction')
ax3.set_title('Mod Views, Low Eng')
ax3.set_axis_off();
ax4 = plt.subplot(gs[3,2])
ax4.scatter(X_tsne2[:,0], X_tsne2[:,1], alpha=0.8, c=y_2, cmap='BuGn');
ax4.annotate(f'{len(y_2)*100/len(y):.2f}%', xy=(0,0.9), xycoords='axes fraction')
ax4.set_title('Low Views, Low Eng')
ax4.set_axis_off();
ax5 = plt.subplot(gs[4,0])
ax5.scatter(X_tsne3[:,0], X_tsne3[:,1], alpha=0.8, c=y_3, cmap='Greens');
ax5.annotate(f'{len(y_3)*100/len(y):.2f}%', xy=(0,0.9), xycoords='axes fraction')
ax5.set_title('Low Views, High Eng')
ax5.set_axis_off();
ax6 = plt.subplot(gs[4,1])
ax6.scatter(X_tsne4[:,0], X_tsne4[:,1], alpha=0.8, c=y_4, cmap='YlOrRd');
ax6.annotate(f'{len(y_4)*100/len(y):.2f}%', xy=(0,0.9), xycoords='axes fraction')
ax6.set_title('High Views, High Eng')
ax6.set_axis_off();
fig.savefig('fig1.png', dpi=500)
Looking at the five clusters and the features for each cluster, each cluster was classified according to its viewership and engagement levels.

For each of the five clusters from the numerical features, the titles and tags in the stackoverflow post questions were used as features to identify the prominent topics for each cluster.
Since words have no concept of distance on their own, the posts must be vectorized. For this, TF-IDF is used. The post titles and post tags were separately vectorized.
For both titles and tags, a token pattern of \w{2,} is used, classifying words as those that have at least two consecutive word characters. English stopwords from nltk were used. Moreover, the word python is redundant since the questions are already filtered to Python posts; using and use are filtered since they are frequent but not relevant.
The minimum document frequency (the percentage of documents a word must appear in for it to be considered a feature) is 0.001 for the titles and 0.01 for the tags.
# --- Text clustering within one numerical cluster -------------------------
# Vectorize post titles and tags separately with TF-IDF, concatenate them,
# PCA to the 80%-variance component count, then cluster with KMeans.
stop_words = stopwords.words('english') + ['python', 'using', 'use']
tfidf_vectorizer = TfidfVectorizer(token_pattern='\w{2,}',
                                   stop_words=stop_words,
                                   binary=True,
                                   min_df=0.001)
# kmeans_cluster['ys'][3] holds the k=5 labels (k_start=2); the ==4 mask
# selects posts in numerical cluster 4.
X3_bow = tfidf_vectorizer.fit_transform(df_X0['Title'][kmeans_cluster['ys'][3]==4])
tfidf_vectorizer_t = TfidfVectorizer(token_pattern='\w{2,}',
                                     stop_words=stop_words,
                                     binary=True,
                                     min_df=0.01)
X3_bow_t = tfidf_vectorizer_t.fit_transform(df_X0['Tags'][kmeans_cluster['ys'][3]==4])
# Combined design matrix: title features followed by tag features.
X3_data = np.concatenate((X3_bow.toarray(), X3_bow_t.toarray()), axis=1)
# PCA via eigendecomposition of the covariance matrix.
V = np.cov(X3_data, rowvar=False)
lambdas, w = np.linalg.eig(V)
indices = np.argsort(lambdas)[::-1]
lambdas = lambdas[indices]
w = w[:, indices]
var_explained = lambdas / lambdas.sum()
fig, ax = plt.subplots(1, 2, figsize=(16,5))
ax[0].plot(np.arange(1, len(var_explained)+1), var_explained, lw=3)
ax[0].set_xlabel('Number of features', size=14)
ax[0].set_ylabel('Variance explained', size=14);
cum_var_explained = var_explained.cumsum()
# fig, ax = plt.subplots(figsize=(12,6))
# Components needed to reach 80% cumulative variance explained.
n_PC = np.searchsorted(cum_var_explained, 0.8)
ax[1].plot(np.arange(1, len(cum_var_explained)+1),
           cum_var_explained, '-', zorder=1, lw=3)
ax[1].set_ylim(ymin=0)
#ax[1].plot([-50, n_PC], [0.8, 0.8], 'r--')
ax[1].plot([n_PC, n_PC], [0, 0.8], 'r--')
ax[1].scatter(n_PC, 0.8, c='r', zorder=2)
#ax[1].set_xlim(-50,)
ax[1].set_xlabel('Features', size=14)
ax[1].set_ylabel('Cumulative variance explained', size=14);
# Elbow: first component whose marginal drop in variance explained is
# below 0.1%.
rate = (var_explained[:-1] - var_explained[1:])/var_explained[:-1]
for m in range(len(rate)):
    if rate[m] < 0.001:
        break
m += 1
ax[0].axvline(m, c='r');
X3_bow_n = TruncatedSVD(n_components=n_PC, random_state=1337).fit_transform(X3_data)
# KMeans over k = 2..11 with the same internal validations as above.
k_start, k_stop = (2, 11)
X3_kmeans_cluster = cluster_range(X3_bow_n, KMeans(random_state=1337),
                                  k_start=k_start, k_stop=k_stop)
fig, ax = plt.subplots(1,5, figsize=(16,3))
ax[0].plot(range(k_start, k_stop+1), X3_kmeans_cluster['inertias'], 'o--')
ax[0].set_xlabel('k')
ax[0].set_title('inertias');
ax[0].grid(which='both')
ax[1].plot(range(k_start, k_stop+1), X3_kmeans_cluster['iidrs'], 'o--')
ax[1].set_xlabel('k')
ax[1].set_title('intra-to-inter');
ax[1].grid(which='both')
ax[2].plot(range(k_start, k_stop+1), X3_kmeans_cluster['chs'], 'o--')
ax[2].set_xlabel('k')
ax[2].set_title('calinski-harabaz score');
ax[2].grid(which='both')
ax[3].plot(range(k_start, k_stop+1), X3_kmeans_cluster['scs'], 'o--')
ax[3].set_xlabel('k')
ax[3].set_title('silhouette score');
ax[3].grid(which='both')
# NOTE(review): k=6 is marked here but k=5 is used for the word clouds
# below — confirm which was the final choice.
k=6
ax[0].axvline(k, c='r', ls='--');
ax[1].axvline(k, c='r', ls='--');
ax[2].axvline(k, c='r', ls='--');
ax[3].axvline(k, c='r', ls='--');
# Stacked bars of cluster sizes per k as a balance measure.
sizes = []
for i in range(k_stop-k_start+1):
    sizes.append(list(np.unique(X3_kmeans_cluster['ys'][i], return_counts=True)[1]))
for i in range(len(sizes)):
    j_prev = 0
    colors = list(zip(*[np.linspace(0,0.8,len(sizes[i])+1) for _ in range(3)]))
    n = 0
    for j in sizes[i]:
        ax[4].bar(i+1, j, width=0.6, bottom=j_prev, color=colors[n])
        j_prev += j
        n+=1
ax[4].set_title('cluster nos');
from wordcloud import WordCloud
# Word clouds of the 50 dominant TF-IDF features per text sub-cluster of
# the k=5 solution (ys index k-k_start selects that run's labels).
k = 5
y_kmeans = X3_kmeans_cluster['ys'][k-k_start]
fig, ax = plt.subplots(1,5, figsize=(16,3))
for i in range(k):
    docs = X3_data[np.where(y_kmeans==i)[0]]
    dom_feat = np.argsort(docs.sum(axis=0))[::-1][:50]
    feat_wgt = docs.sum(axis=0)[dom_feat]
    # Map indices back into the concatenated title+tag vocabulary
    # (same column order as X3_data).
    dom_feat = np.array(tfidf_vectorizer.get_feature_names()
                        + tfidf_vectorizer_t.get_feature_names())[dom_feat]
    # +1 keeps every frequency strictly positive for WordCloud.
    word_cld = {i[0]:int(i[1])+1 for i in zip(dom_feat, feat_wgt)}
    wordcloud = WordCloud(width = 800, height = 800,
                          background_color ='white',
                          collocations = False).generate_from_frequencies(word_cld)
    ax[i].imshow(wordcloud)
    ax[i].axis('off')

# --- Final summary grid ---------------------------------------------------
# Precomputed word clouds per numerical cluster (0..4) are loaded and
# placed by hand into a 10x6 grid: rows are topics, columns 1..5 are the
# five numerical clusters, column 0 holds the topic labels.
wordcloud0 = pickle.load(open(path_+'wordcloud0', 'rb'))
wordcloud1 = pickle.load(open(path_+'wordcloud1', 'rb'))
wordcloud2 = pickle.load(open(path_+'wordcloud2', 'rb'))
wordcloud3 = pickle.load(open(path_+'wordcloud3', 'rb'))
wordcloud4 = pickle.load(open(path_+'wordcloud4', 'rb'))
fig, ax = plt.subplots(10, 6, figsize=(15,25), sharey=False, dpi=200)
plt.subplots_adjust(wspace=0, hspace=0)
# Grey cell borders, no ticks; the first (label) column has no axes.
for i in range(10):
    ax[i,0].set_axis_off();
    for j in range(1,6):
        ax[i,j].spines['top'].set_color('Grey')
        ax[i,j].spines['left'].set_color('Grey')
        ax[i,j].spines['bottom'].set_color('Grey')
        ax[i,j].spines['right'].set_color('Grey')
        ax[i,j].set_xticks([])
        ax[i,j].set_yticks([])
# Column headers: cluster nickname + viewership/engagement level.
ax[0,1].set_title('$Hot$ $Posts$\nHi View, Hi Eng');
ax[0,2].set_title('$Trending$\nMod View, Hi Eng');
ax[0,3].set_title('$S.O.S$\nMod View, Low Eng');
ax[0,4].set_title('$Curious$ $Topics$\nLow View, Hi Eng');
ax[0,5].set_title('$Spam$\nLow View, Low Eng');
# Hot Posts column (default coloring).
ax[0,1].imshow(wordcloud4[0], aspect='auto');
ax[9,1].imshow(wordcloud4[1], aspect='auto');
ax[1,1].imshow(wordcloud4[2], aspect='auto');
ax[5,1].imshow(wordcloud4[3], aspect='auto');
ax[6,1].imshow(wordcloud4[4], aspect='auto');
# Curious Topics column, recolored green.
cmap_g = LinearSegmentedColormap.from_list('mycmap', ['#11644D', '#A0B046'])
ax[3,4].imshow(wordcloud3[0].recolor(colormap=cmap_g), aspect='auto');
ax[2,4].imshow(wordcloud3[1].recolor(colormap=cmap_g), aspect='auto');
ax[9,4].imshow(wordcloud3[2].recolor(colormap=cmap_g), aspect='auto');
ax[5,4].imshow(wordcloud3[3].recolor(colormap=cmap_g), aspect='auto');
ax[1,4].imshow(wordcloud3[4].recolor(colormap=cmap_g), aspect='auto');
ax[4,4].imshow(wordcloud3[5].recolor(colormap=cmap_g), aspect='auto');
# Per-column colormaps: purple (Trending), blue (S.O.S), blue-green (Spam).
cmap_p = LinearSegmentedColormap.from_list('mycmap', ['#DDA0DD', '#4B0082'])
cmap_b = LinearSegmentedColormap.from_list('mycmap', ['#107FC9', '#0B108C'])
cmap_bg = LinearSegmentedColormap.from_list('mycmap', ['#20B2AA', '#008080'])
ax[0,2].imshow(wordcloud0[0].recolor(colormap=cmap_p), aspect='auto');
ax[5,2].imshow(wordcloud0[1].recolor(colormap=cmap_p), aspect='auto');
ax[1,2].imshow(wordcloud0[2].recolor(colormap=cmap_p), aspect='auto');
ax[6,2].imshow(wordcloud0[3].recolor(colormap=cmap_p), aspect='auto');
ax[9,2].imshow(wordcloud0[4].recolor(colormap=cmap_p), aspect='auto');
ax[2,2].imshow(wordcloud0[5].recolor(colormap=cmap_p), aspect='auto');
ax[2,3].imshow(wordcloud1[0].recolor(colormap=cmap_b), aspect='auto');
ax[7,3].imshow(wordcloud1[1].recolor(colormap=cmap_b), aspect='auto');
ax[5,3].imshow(wordcloud1[2].recolor(colormap=cmap_b), aspect='auto');
ax[9,3].imshow(wordcloud1[3].recolor(colormap=cmap_b), aspect='auto');
ax[8,3].imshow(wordcloud1[4].recolor(colormap=cmap_b), aspect='auto');
ax[0,3].imshow(wordcloud1[5].recolor(colormap=cmap_b), aspect='auto');
ax[1,3].imshow(wordcloud1[6].recolor(colormap=cmap_b), aspect='auto');
ax[6,5].imshow(wordcloud2[0].recolor(colormap=cmap_bg), aspect='auto');
ax[0,5].imshow(wordcloud2[1].recolor(colormap=cmap_bg), aspect='auto');
ax[9,5].imshow(wordcloud2[2].recolor(colormap=cmap_bg), aspect='auto');
ax[5,5].imshow(wordcloud2[3].recolor(colormap=cmap_bg), aspect='auto');
ax[3,5].imshow(wordcloud2[4].recolor(colormap=cmap_bg), aspect='auto');
# Row labels: the topic each row represents.
ax[0,0].annotate('$Pandas$', xy=(0.3, 0.5), xycoords='axes fraction', fontsize=12)
ax[1,0].annotate('$Numpy$', xy=(0.3, 0.5), xycoords='axes fraction', fontsize=12)
ax[2,0].annotate('$String$\n$Processing$', xy=(0.3, 0.5), xycoords='axes fraction', fontsize=12)
ax[3,0].annotate('$Lists$', xy=(0.3, 0.5), xycoords='axes fraction', fontsize=12)
ax[4,0].annotate('$Classes$', xy=(0.3, 0.5), xycoords='axes fraction', fontsize=12)
ax[5,0].annotate('$Django$', xy=(0.3, 0.5), xycoords='axes fraction', fontsize=12)
ax[6,0].annotate('$Matplotlib$', xy=(0.3, 0.5), xycoords='axes fraction', fontsize=12)
ax[7,0].annotate('$Web$\n$Interfaces$', xy=(0.3, 0.5), xycoords='axes fraction', fontsize=12)
ax[8,0].annotate('$Web$\n$Engines$', xy=(0.3, 0.5), xycoords='axes fraction', fontsize=12)
ax[9,0].annotate('$Mix$', xy=(0.3, 0.5), xycoords='axes fraction', fontsize=12)
Looking at the non-numerical clusters across the different numerical clusters, there are some differences in the features within each cluster. For example, for the numpy clusters, the Trending, S.O.S, and Curious Topics clusters also show scipy alongside numpy, whereas in the Hot Posts cluster the scipy feature is not visible. This can be attributed to scipy's target audience — the scientific community — being a smaller proportion of the programming community.


